This is analysis for proteomics dataset generated using mouse scraped colon epithelium samples from Fabp-Cre;KRasWT and Fabp-Cre;KRasG12D mice. Emily Poulin harvested the tissue samples and the proteomics were performed by Joao Paulo. Part of the analysis code was adapted from original script from Shikha Sheth.
options(connectionObserver = NULL)
suppressMessages(
c(library(gridExtra),
library(ensembldb),
library(EnsDb.Mmusculus.v79),
library(grid),
library(ggplot2),
library(lattice),
library(reshape),
library(mixOmics),
library(gplots),
library(RColorBrewer),
library(readr),
library(dplyr),
library(VennDiagram),
library(clusterProfiler),
library(DOSE),
library(org.Mm.eg.db),
library(pathview),
library(AnnotationDbi),
library(tidyr),
library(qdapRegex),
library(gtools),
library(ggfortify))
)
## [1] "gridExtra" "stats" "graphics"
## [4] "grDevices" "utils" "datasets"
## [7] "methods" "base" "ensembldb"
## [10] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [13] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [16] "IRanges" "S4Vectors" "stats4"
## [19] "BiocGenerics" "parallel" "gridExtra"
## [22] "stats" "graphics" "grDevices"
## [25] "utils" "datasets" "methods"
## [28] "base" "EnsDb.Mmusculus.v79" "ensembldb"
## [31] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [34] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [37] "IRanges" "S4Vectors" "stats4"
## [40] "BiocGenerics" "parallel" "gridExtra"
## [43] "stats" "graphics" "grDevices"
## [46] "utils" "datasets" "methods"
## [49] "base" "grid" "EnsDb.Mmusculus.v79"
## [52] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [55] "AnnotationDbi" "Biobase" "GenomicRanges"
## [58] "GenomeInfoDb" "IRanges" "S4Vectors"
## [61] "stats4" "BiocGenerics" "parallel"
## [64] "gridExtra" "stats" "graphics"
## [67] "grDevices" "utils" "datasets"
## [70] "methods" "base" "ggplot2"
## [73] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [76] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [79] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [82] "IRanges" "S4Vectors" "stats4"
## [85] "BiocGenerics" "parallel" "gridExtra"
## [88] "stats" "graphics" "grDevices"
## [91] "utils" "datasets" "methods"
## [94] "base" "lattice" "ggplot2"
## [97] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [100] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [103] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [106] "IRanges" "S4Vectors" "stats4"
## [109] "BiocGenerics" "parallel" "gridExtra"
## [112] "stats" "graphics" "grDevices"
## [115] "utils" "datasets" "methods"
## [118] "base" "reshape" "lattice"
## [121] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [124] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [127] "AnnotationDbi" "Biobase" "GenomicRanges"
## [130] "GenomeInfoDb" "IRanges" "S4Vectors"
## [133] "stats4" "BiocGenerics" "parallel"
## [136] "gridExtra" "stats" "graphics"
## [139] "grDevices" "utils" "datasets"
## [142] "methods" "base" "mixOmics"
## [145] "MASS" "reshape" "lattice"
## [148] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [151] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [154] "AnnotationDbi" "Biobase" "GenomicRanges"
## [157] "GenomeInfoDb" "IRanges" "S4Vectors"
## [160] "stats4" "BiocGenerics" "parallel"
## [163] "gridExtra" "stats" "graphics"
## [166] "grDevices" "utils" "datasets"
## [169] "methods" "base" "gplots"
## [172] "mixOmics" "MASS" "reshape"
## [175] "lattice" "ggplot2" "grid"
## [178] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [181] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [184] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [187] "S4Vectors" "stats4" "BiocGenerics"
## [190] "parallel" "gridExtra" "stats"
## [193] "graphics" "grDevices" "utils"
## [196] "datasets" "methods" "base"
## [199] "RColorBrewer" "gplots" "mixOmics"
## [202] "MASS" "reshape" "lattice"
## [205] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [208] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [211] "AnnotationDbi" "Biobase" "GenomicRanges"
## [214] "GenomeInfoDb" "IRanges" "S4Vectors"
## [217] "stats4" "BiocGenerics" "parallel"
## [220] "gridExtra" "stats" "graphics"
## [223] "grDevices" "utils" "datasets"
## [226] "methods" "base" "readr"
## [229] "RColorBrewer" "gplots" "mixOmics"
## [232] "MASS" "reshape" "lattice"
## [235] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [238] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [241] "AnnotationDbi" "Biobase" "GenomicRanges"
## [244] "GenomeInfoDb" "IRanges" "S4Vectors"
## [247] "stats4" "BiocGenerics" "parallel"
## [250] "gridExtra" "stats" "graphics"
## [253] "grDevices" "utils" "datasets"
## [256] "methods" "base" "dplyr"
## [259] "readr" "RColorBrewer" "gplots"
## [262] "mixOmics" "MASS" "reshape"
## [265] "lattice" "ggplot2" "grid"
## [268] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [271] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [274] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [277] "S4Vectors" "stats4" "BiocGenerics"
## [280] "parallel" "gridExtra" "stats"
## [283] "graphics" "grDevices" "utils"
## [286] "datasets" "methods" "base"
## [289] "VennDiagram" "futile.logger" "dplyr"
## [292] "readr" "RColorBrewer" "gplots"
## [295] "mixOmics" "MASS" "reshape"
## [298] "lattice" "ggplot2" "grid"
## [301] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [304] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [307] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [310] "S4Vectors" "stats4" "BiocGenerics"
## [313] "parallel" "gridExtra" "stats"
## [316] "graphics" "grDevices" "utils"
## [319] "datasets" "methods" "base"
## [322] "clusterProfiler" "VennDiagram" "futile.logger"
## [325] "dplyr" "readr" "RColorBrewer"
## [328] "gplots" "mixOmics" "MASS"
## [331] "reshape" "lattice" "ggplot2"
## [334] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [337] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [340] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [343] "IRanges" "S4Vectors" "stats4"
## [346] "BiocGenerics" "parallel" "gridExtra"
## [349] "stats" "graphics" "grDevices"
## [352] "utils" "datasets" "methods"
## [355] "base" "DOSE" "clusterProfiler"
## [358] "VennDiagram" "futile.logger" "dplyr"
## [361] "readr" "RColorBrewer" "gplots"
## [364] "mixOmics" "MASS" "reshape"
## [367] "lattice" "ggplot2" "grid"
## [370] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [373] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [376] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [379] "S4Vectors" "stats4" "BiocGenerics"
## [382] "parallel" "gridExtra" "stats"
## [385] "graphics" "grDevices" "utils"
## [388] "datasets" "methods" "base"
## [391] "org.Mm.eg.db" "DOSE" "clusterProfiler"
## [394] "VennDiagram" "futile.logger" "dplyr"
## [397] "readr" "RColorBrewer" "gplots"
## [400] "mixOmics" "MASS" "reshape"
## [403] "lattice" "ggplot2" "grid"
## [406] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [409] "GenomicFeatures" "AnnotationDbi" "Biobase"
## [412] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [415] "S4Vectors" "stats4" "BiocGenerics"
## [418] "parallel" "gridExtra" "stats"
## [421] "graphics" "grDevices" "utils"
## [424] "datasets" "methods" "base"
## [427] "pathview" "org.Mm.eg.db" "DOSE"
## [430] "clusterProfiler" "VennDiagram" "futile.logger"
## [433] "dplyr" "readr" "RColorBrewer"
## [436] "gplots" "mixOmics" "MASS"
## [439] "reshape" "lattice" "ggplot2"
## [442] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [445] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [448] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [451] "IRanges" "S4Vectors" "stats4"
## [454] "BiocGenerics" "parallel" "gridExtra"
## [457] "stats" "graphics" "grDevices"
## [460] "utils" "datasets" "methods"
## [463] "base" "pathview" "org.Mm.eg.db"
## [466] "DOSE" "clusterProfiler" "VennDiagram"
## [469] "futile.logger" "dplyr" "readr"
## [472] "RColorBrewer" "gplots" "mixOmics"
## [475] "MASS" "reshape" "lattice"
## [478] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [481] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [484] "AnnotationDbi" "Biobase" "GenomicRanges"
## [487] "GenomeInfoDb" "IRanges" "S4Vectors"
## [490] "stats4" "BiocGenerics" "parallel"
## [493] "gridExtra" "stats" "graphics"
## [496] "grDevices" "utils" "datasets"
## [499] "methods" "base" "tidyr"
## [502] "pathview" "org.Mm.eg.db" "DOSE"
## [505] "clusterProfiler" "VennDiagram" "futile.logger"
## [508] "dplyr" "readr" "RColorBrewer"
## [511] "gplots" "mixOmics" "MASS"
## [514] "reshape" "lattice" "ggplot2"
## [517] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [520] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [523] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [526] "IRanges" "S4Vectors" "stats4"
## [529] "BiocGenerics" "parallel" "gridExtra"
## [532] "stats" "graphics" "grDevices"
## [535] "utils" "datasets" "methods"
## [538] "base" "qdapRegex" "tidyr"
## [541] "pathview" "org.Mm.eg.db" "DOSE"
## [544] "clusterProfiler" "VennDiagram" "futile.logger"
## [547] "dplyr" "readr" "RColorBrewer"
## [550] "gplots" "mixOmics" "MASS"
## [553] "reshape" "lattice" "ggplot2"
## [556] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [559] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [562] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [565] "IRanges" "S4Vectors" "stats4"
## [568] "BiocGenerics" "parallel" "gridExtra"
## [571] "stats" "graphics" "grDevices"
## [574] "utils" "datasets" "methods"
## [577] "base" "gtools" "qdapRegex"
## [580] "tidyr" "pathview" "org.Mm.eg.db"
## [583] "DOSE" "clusterProfiler" "VennDiagram"
## [586] "futile.logger" "dplyr" "readr"
## [589] "RColorBrewer" "gplots" "mixOmics"
## [592] "MASS" "reshape" "lattice"
## [595] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [598] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [601] "AnnotationDbi" "Biobase" "GenomicRanges"
## [604] "GenomeInfoDb" "IRanges" "S4Vectors"
## [607] "stats4" "BiocGenerics" "parallel"
## [610] "gridExtra" "stats" "graphics"
## [613] "grDevices" "utils" "datasets"
## [616] "methods" "base" "ggfortify"
## [619] "gtools" "qdapRegex" "tidyr"
## [622] "pathview" "org.Mm.eg.db" "DOSE"
## [625] "clusterProfiler" "VennDiagram" "futile.logger"
## [628] "dplyr" "readr" "RColorBrewer"
## [631] "gplots" "mixOmics" "MASS"
## [634] "reshape" "lattice" "ggplot2"
## [637] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [640] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [643] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [646] "IRanges" "S4Vectors" "stats4"
## [649] "BiocGenerics" "parallel" "gridExtra"
## [652] "stats" "graphics" "grDevices"
## [655] "utils" "datasets" "methods"
## [658] "base"
The original dataset was loaded.
colon_proteom <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon/2015-03_HaigisMouseColon8plex_Prot.csv")
## Warning: Missing column names filled in: 'X2' [2]
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## `Protein Id` = col_character(),
## X2 = col_character(),
## `Gene Symbol` = col_character(),
## Description = col_character(),
## FABP_1 = col_double(),
## FABP_2 = col_double(),
## FABP_4 = col_double(),
## FABP_5 = col_double(),
## G12D_1 = col_double(),
## G12D_2 = col_double(),
## G12D_3 = col_double(),
## G12D_4 = col_double()
## )
# establish a new data frame for collecting stats
colon_stats <- colon_proteom[,1:3]
# Calculate the pvalue using parametric unpaired t test
p_value_list <- c()
for (i in 1:dim(colon_proteom)[1]) {
p_value <- t.test(unlist(colon_proteom[i,9:12]), unlist(colon_proteom[i,5:8]), paired = FALSE)$p.value
p_value_list <- c(p_value_list, p_value)
}
colon_stats <- cbind(colon_stats, p_value_list)
colnames(colon_stats)[4] <- "p_values"
# calculate the q value using Benjamini Hochberg FDR correction
q_value_list <- p.adjust(colon_stats$p_values, method = "BH")
colon_stats <- cbind(colon_stats, q_value_list)
colnames(colon_stats)[5] <- "q_values"
# calculate fold change and log fold change
foldchange_list <- c()
for (i in 1:dim(colon_proteom)[1]) {
foldchange <- foldchange(mean(unlist(colon_proteom[i,9:12])), mean(unlist(colon_proteom[i,5:8])))
foldchange_list <- c(foldchange_list, foldchange)
}
logfoldchange_list <- foldchange2logratio(foldchange_list)
colon_stats <- cbind(colon_stats, foldchange_list, logfoldchange_list)
colnames(colon_stats)[6:7] <- c("foldChange", "LFC")
Now we output this statistical analysis file into a csv file.
write.csv(colon_stats, "ceMS_diff.csv", col.names = NULL)
Just to check how many siginificant proteins do we have based on p< 0.05 and q< 0.1
sig_dif_stats <- subset(colon_stats, colon_stats$p_values <= 0.05 & colon_stats$q_values <= 0.1)
dim(sig_dif_stats)[1]
## [1] 2348
So there are 2348 proteins identified to have significantly different expression between G12D/WT.
Since the number of significant changes are quite small, I want to use PCA to check how the samples cluster.
dir.create("PDF_figure", showWarnings = FALSE)
df <- colon_proteom[5:12]
df <- as.data.frame(t(df))
df <- cbind(df, c('WT','WT','WT','WT','G12D','G12D','G12D','G12D'))
colnames(df)[8162] <- 'Genotype'
df.set <- as.matrix(df[,1:8161])
df.pca <- prcomp(df.set, center = TRUE, scale = TRUE)
autoplot(df.pca, data = df, colour = 'Genotype') +
geom_text(aes(label=rownames(df)), vjust = 2, hjust = -0.1) +
xlim(-0.5, 0.6) + ylim(-0.7, 0.6)
## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
pdf('PDF_figure/PCA.pdf',
height = 4,
width = 6)
autoplot(df.pca, data = df, colour = 'Genotype') +
geom_text(aes(label=rownames(df)), vjust = 2, hjust = -0.1) +
xlim(-0.5, 0.6) + ylim(-0.7, 0.6)
dev.off()
## quartz_off_screen
## 2
pseudoCount = log2(colon_proteom[5:12])
# remove NA, NaN, Inf values from the dataframe
pseudoCount <- na.omit(pseudoCount)
pseudoCount <- pseudoCount[is.finite(rowSums(pseudoCount)),]
mat.dist = pseudoCount
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(10, 10 ))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('PDF_figure/Hierchical_Clustering.pdf')
cim(mat.dist, symkey = FALSE, margins = c(7, 7))
dev.off()
## quartz_off_screen
## 2
Final output is following:
For heatmap, I will z-score all quantifications across all samples for the same protein. Heatmaps for all proteins with p<0.05 and q < 0.1 are plotted
suppressMessages(library(mosaic))
sig_index <- c()
duplicate <- c()
for (i in 1:dim(sig_dif_stats)[1]) {
index <- grep(sig_dif_stats$`Protein Id`[i], colon_proteom$`Protein Id`)
if (length(index) == 1) {
sig_index <- c(sig_index, index)
}
else {
duplicate <- c(duplicate, i)
}
}
sig_count <- colon_proteom[sig_index,]
sig_dif <- cbind(sig_dif_stats[-duplicate,], sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,12:19] <- zscore(as.numeric(sig_dif[i,12:19]))
}
my_palette <- colorRampPalette(c("red", "white", "blue"))(256)
heatmap_matrix <- as.matrix(sig_dif[,12:19])
png('G12D vs WT colon proteomics.png',
width = 600,
height = 1400,
res = 200,
pointsize = 8)
par(cex.main=1.1)
heatmap.2(heatmap_matrix,
main = "DE proteins\nin colon epithelium\n(G12D/WT) p < 0.05 q < 0.1",
density.info = "none",
key = TRUE,
lwid = c(3,7),
lhei = c(1,7),
col=my_palette,
margins = c(8,2),
symbreaks = TRUE,
trace = "none",
dendrogram = "row",
labRow = FALSE,
ylab = "Proteins",
cexCol = 1.5,
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/Heatmap.pdf',
width = 6,
height = 14)
heatmap.2(heatmap_matrix,
main = "DE proteins\nin colon epithelium\n(G12D/WT) p < 0.05 q < 0.1",
density.info = "none",
key = TRUE,
lwid = c(3,7),
lhei = c(1,7),
col=my_palette,
margins = c(8,2),
symbreaks = TRUE,
trace = "none",
dendrogram = "row",
labRow = FALSE,
ylab = "Proteins",
cexCol = 1.5,
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
Final output is
# Scatter plot
colon_stats$KrasG12D_mean <- rowMeans(log2(colon_proteom[,9:12]))
colon_stats$KrasWT_mean <- rowMeans(log2(colon_proteom[,5:8]))
ggplot(colon_stats, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") +
geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Scatter Plot")
pdf('PDF_figure/Scatter_Plot.pdf',
width = 5,
height = 4)
ggplot(colon_stats, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") +
geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Scatter Plot")
dev.off()
## quartz_off_screen
## 2
# MA plot
colon_stats$'baseMean' <- rowMeans(colon_proteom[,5:12])
colon_stats$'log2baseMean' <- log(colon_stats$`baseMean`,2)
red_subset <- subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC > 0)
blue_subset <- subset(colon_stats,colon_stats$p_values < 0.05 & colon_stats$q_values < 0.1 & colon_stats$LFC < 0)
ggplot(colon_stats, aes(x = colon_stats$`log2baseMean`, y = colon_stats$`LFC`)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = red_subset, aes(x=red_subset$`log2baseMean`, y=red_subset$`LFC`), alpha = 0.5, size = 1, color = "red") +
geom_point(data = blue_subset, aes(x=blue_subset$`log2baseMean`, y=blue_subset$`LFC`), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT MA Plot")
## Warning: Use of `colon_stats$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `colon_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
pdf('PDF_figure/MA_Plot.pdf',
width = 5,
height = 4)
ggplot(colon_stats, aes(x = colon_stats$`log2baseMean`, y = colon_stats$`LFC`)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = red_subset, aes(x=red_subset$`log2baseMean`, y=red_subset$`LFC`), alpha = 0.5, size = 1, color = "red") +
geom_point(data = blue_subset, aes(x=blue_subset$`log2baseMean`, y=blue_subset$`LFC`), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT MA Plot")
## Warning: Use of `colon_stats$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `colon_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$log2baseMean` is discouraged. Use `log2baseMean`
## instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
ggplot(colon_stats, aes(x = colon_stats$`LFC`, y = -log(colon_stats$`p_values`,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = red_subset, aes(x=red_subset$`LFC`, y=-log(red_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "red") +
geom_point(data = blue_subset, aes(x=blue_subset$`LFC`, y=-log(blue_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Volcano Plot")
## Warning: Use of `colon_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `colon_stats$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$p_values` is discouraged. Use `p_values` instead.
pdf('PDF_figure/Volcano_Plot.pdf',
width = 5,
height = 4)
ggplot(colon_stats, aes(x = colon_stats$`LFC`, y = -log(colon_stats$`p_values`,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = colon_stats, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = red_subset, aes(x=red_subset$`LFC`, y=-log(red_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "red") +
geom_point(data = blue_subset, aes(x=blue_subset$`LFC`, y=-log(blue_subset$`p_values`,10)), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Volcano Plot")
## Warning: Use of `colon_stats$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `colon_stats$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `red_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `red_subset$p_values` is discouraged. Use `p_values` instead.
## Warning: Use of `blue_subset$LFC` is discouraged. Use `LFC` instead.
## Warning: Use of `blue_subset$p_values` is discouraged. Use `p_values` instead.
dev.off()
## quartz_off_screen
## 2
For the Go analysis, I am using all proteins that have a p<0.05 and q < 0.1.
target_gene <- as.character(sig_dif_stats$`Protein Id`)
detected_gene <- as.character(colon_proteom$`Protein Id`)
# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "UNIPROT",
OrgDb = org.Mm.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)
dim(cluster_summary_BP)[1]
## [1] 156
write.csv(cluster_summary_BP, "GO/GO analysis_BP.csv")
# Run GO enrichment analysis for molecular function
ego_MF <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "UNIPROT",
OrgDb = org.Mm.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)
dim(cluster_summary_MF)[1]
## [1] 22
write.csv(cluster_summary_MF, "GO/GO analysis_MF.csv")
# Run GO enrichment analysis for cellular component
ego_CC <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "UNIPROT",
OrgDb = org.Mm.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)
dim(cluster_summary_CC)[1]
## [1] 24
write.csv(cluster_summary_CC, "GO/GO analysis_CC.csv")
png('GO/GO dotplot_BP.png',
width = 2000,
height = 1600,
res = 100,
pointsize = 8)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO dotplot_BP.pdf',
width = 20,
height = 16)
dotplot(ego_BP, showCategory=50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Molecular Function
png('GO/GO dotplot_MF.png',
width = 800,
height = 800,
res = 100,
pointsize = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO dotplot_MF.pdf',
width = 8,
height = 8)
dotplot(ego_MF, showCategory=50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Cellular Component
png('GO/GO dotplot_CC.png',
width = 800,
height = 800,
res = 100,
pointsize = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO dotplot_CC.pdf',
width = 8,
height = 8)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen
## 2
Final output is following:
png('GO/GO enrichment_BP.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO enrichment_BP.pdf',
width = 16,
height = 16)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Molecular Function
png('GO/GO enrichment_MF.png',
width = 1000,
height = 1000,
res = 100,
pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO enrichment_MF.pdf',
width = 10,
height = 10)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Cellular Component
png('GO/GO enrichment_CC.png',
width = 1000,
height = 1000,
res = 100,
pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO enrichment_CC.pdf',
width = 10,
height = 10)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
Final output is following:
The category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color).
OE_foldchanges <- colon_stats$LFC
names(OE_foldchanges) <- colon_stats$`Gene Symbol`
png('GO/GO cnetplot_BP.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_BP,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO cnetplot_BP.pdf',
width = 16,
height = 16)
cnetplot(ego_BP,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Molecular Function
png('GO/GO cnetplot_MF.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_MF,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO cnetplot_MF.pdf',
width = 16,
height = 16)
cnetplot(ego_MF,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
## Warning: ggrepel: 3 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
dev.off()
## quartz_off_screen
## 2
Final output is following: #### Cellular Component
png('GO/GO cnetplot_CC.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_CC,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
pdf('PDF_figure/GO cnetplot_CC.pdf',
width = 16,
height = 16)
cnetplot(ego_CC,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
Final output is following:
First I need to annotate quantification file with Ensembl ID.
# annotate the tumor stats with gene symbol
annotations_edb <- AnnotationDbi::select(org.Mm.eg.db,
keys = colon_proteom$`Protein Id`,
columns = c("ENSEMBL"),
keytype = "UNIPROT")
## 'select()' returned 1:many mapping between keys and columns
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_edb$UNIPROT) == FALSE)
non_duplicates_idx <- which(duplicated(annotations_edb$ENSEMBL) == FALSE)
non_duplicates_idx <- which(is.na(annotations_edb$ENSEMBL) == FALSE)
# Return only the non-duplicated genes using indices
annotations_edb <- annotations_edb[non_duplicates_idx, ]
# Check number of NAs returned
is.na(annotations_edb$ENSEMBL) %>%
which() %>%
length()
## [1] 0
# Join the Ensembl annotation to proteomics quantification
colon_proteom <- inner_join(colon_proteom, annotations_edb, by=c("Protein Id"="UNIPROT"))
write.csv(colon_proteom, "GSEA/Raw Quantification.csv")